library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(readr)
library(viridis)
## Loading required package: viridisLite
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
Note: there are only few individuals of the small ciliates because only cells bigger than 1000um2 are tracked at this magnification and most small ciliates (Tetrahymena, Dexiostoma and Loxocephalus) are smaller than that.
load("../data/16x/Coleps_viridis/Morph_mvt.RData")
dd16 <- morph_mvt
load("../data/16x/Coleps_irchel/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Didinium/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Euplotes/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Euplotes_second/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Paramecium_bursaria/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Paramecium_caudatum/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Dexiostoma/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Loxocephallus/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../data/16x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd16 <- rbind(dd16, morph_mvt)
## filtering
dd16 <- dd16 %>%
filter(net_disp>50, net_speed>5)
dd16$magnification <- 1.6
dd16$species <- ifelse(dd16$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),"Smaller_ciliates",dd16$species)
species <- unique(dd16$species)
compositions <- read_csv("../../compositions.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## composition = col_character()
## )
## See spec(...) for full column specifications.
compositions$Smaller_ciliates <- with(compositions,
ifelse(Tetrahymena == 1| Dexiostoma == 1| Loxocephallus == 1, 1, 0))
compositions <- compositions[,species]
compositions_list <- apply(compositions, 1, function(x){
idx <- which(x==1)
names(idx)
})
names(compositions_list) <- paste0("c_",c(paste0(0,1:9),10:16))
classifier_plot_svm <- function(table, combination.nr){
# cm <- classifier$confusion
cm <- table
ncol <- ncol(cm)
cm <- apply(cm, 1, function(x) x/sum(x))
cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
Classification = as.vector(cm))
plot <- cm_long %>%
ggplot(aes(Predicted,Truth,fill=Classification))+
geom_tile() +
geom_text(aes(label = round(Classification, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
theme(legend.text = element_text(size=14),
axis.title = element_text(size=14),
title = element_text(size=16),
axis.text = element_text(size=13))+
labs(title=paste("PPV of",combination.nr),fill="PPV")
return(plot)
}
classifier_assessment_plot <- function(cf, combination.nr){
cf.df <- cf$byClass[,1:4] %>%
as.data.frame() %>%
mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
"Sens" = "Sensitivity", "Spec" = "Specificity") %>%
pivot_longer(cols = 1:4) %>%
mutate(col = ifelse(value>=0.9,"1",
ifelse(value<0.9 & value>=0.8,"2",
ifelse(value<0.8 & value>=0.7,"3","4"))))
plot <- cf.df %>%
ggplot(aes(name,Species,fill=col))+
geom_tile() +
geom_text(aes(label = round(value, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
theme(legend.text = element_text(size=14),
title = element_text(size = 16),
axis.title = element_blank(),
axis.text = element_text(size=13),
legend.position = "none")+
labs(title=combination.nr, fill="")
return(plot)
}
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
sd_major + mean_ar + sd_ar +
mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed
trainingdata <- dd16[complete.cases(dd16), ]
set.seed(624)
trainingdata$species <- factor(trainingdata$species)
split_up <- split(trainingdata, f = trainingdata$species)
subsamples <- lapply(split_up, function(x) {
x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)
# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
p = 0.65, # percentage of data as training
list = FALSE)
testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
obj <- tune(svm, formula, data = trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=testdata %>% select(-species)),
testdata$species)
classifier_plot_svm(table = cf$table,
combination.nr = "all")
classifier_assessment_plot(cf = cf, combination.nr = "all")
There are mainly 4 measures:
Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)
Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)
Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)
Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)
PPV is the most important for us!
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
sd_major + mean_ar + sd_ar +
mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed
set.seed(62)
classifiers_18c_16x <- list()
classifiers_18c_16x_plots <- list()
classifiers_18c_16x_assess_plots <- list()
trainingdata <- dd16[complete.cases(dd16), ]
for(i in 1:length(compositions_list)){
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]]))
n.var <- length(unique(sub_trainingdata$species))
sub_trainingdata$species <- factor(sub_trainingdata$species)
split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
subsamples <- lapply(split_up, function(x) {
x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
sub_trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)
# A stratified random split of the data
idx_train <- createDataPartition(sub_trainingdata$species,
p = 0.7, # percentage of data as training
list = FALSE)
sub_testdata <- sub_trainingdata[-idx_train,]
sub_trainingdata <- sub_trainingdata[idx_train,]
n.min <- min(table(sub_trainingdata$species))
obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
classifiers_18c_16x[[i]] <- obj$best.model
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species)
classifiers_18c_16x_plots[[i]] <- classifier_plot_svm(table = cf$table,
combination.nr = names(compositions_list)[[i]])
classifiers_18c_16x_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}
names(classifiers_18c_16x_plots) <- names(classifiers_18c_16x) <-
names(classifiers_18c_16x_assess_plots) <- paste0("c_",c(paste0(0,1:9),10:16))
library(patchwork)
for(i in 1:16){
print(classifiers_18c_16x_plots[[i]] + classifiers_18c_16x_assess_plots[[i]] +
plot_layout(widths = c(4,2)))
}
saveRDS(classifiers_18c_16x, "svm_video_classifiers_18c_16x.rds")
# cl <- readRDS("classifiers_18c_16x.rds")